This analysis examines which academic and experiential factors predict early career success using the Education & Career Success dataset (n = 5,000).
Specifically, the goal of this study is to provide actionable insights that can help students, educators, and academic advisors better understand which factors most strongly influence early-career salary and the likelihood of pursuing entrepreneurship. By identifying these key drivers, the findings are intended to support more informed decision-making around course selection, skill development, and career planning.
# Load the raw dataset, standardize the column names, drop the ID
# column (no analytic value), and encode the categorical fields as
# factors. Entrepreneurship gets explicit levels so "Yes" is the
# second (positive) level for the logistic model later on.
df <- read_csv("../../Data/MRA_education_career_success.csv", show_col_types = FALSE) %>%
  janitor::clean_names() %>%
  select(-student_id) %>%
  mutate(
    entrepreneurship  = factor(entrepreneurship, levels = c("No", "Yes")),
    gender            = factor(gender),
    field_of_study    = factor(field_of_study),
    current_job_level = factor(current_job_level)
  )
glimpse(df)
## Rows: 5,000
## Columns: 19
## $ age <dbl> 24, 21, 28, 25, 22, 24, 27, 20, 24, 28, 28, 25, …
## $ gender <fct> Male, Other, Female, Male, Male, Male, Male, Mal…
## $ high_school_gpa <dbl> 3.58, 2.52, 3.42, 2.43, 2.08, 2.40, 2.36, 2.68, …
## $ sat_score <dbl> 1052, 1211, 1193, 1497, 1012, 1600, 1011, 1074, …
## $ university_ranking <dbl> 291, 112, 715, 170, 599, 631, 610, 240, 337, 138…
## $ university_gpa <dbl> 3.96, 3.63, 2.63, 2.81, 2.48, 3.78, 3.83, 2.84, …
## $ field_of_study <fct> Arts, Law, Medicine, Computer Science, Engineeri…
## $ internships_completed <dbl> 3, 4, 4, 3, 4, 2, 0, 1, 2, 1, 2, 2, 2, 0, 1, 3, …
## $ projects_completed <dbl> 7, 7, 8, 9, 6, 3, 1, 5, 3, 5, 7, 2, 0, 4, 2, 5, …
## $ certifications <dbl> 2, 3, 1, 1, 4, 2, 3, 5, 0, 3, 5, 3, 5, 3, 3, 2, …
## $ soft_skills_score <dbl> 9, 8, 1, 10, 10, 2, 3, 5, 5, 10, 8, 2, 2, 8, 1, …
## $ networking_score <dbl> 8, 1, 9, 6, 9, 2, 3, 1, 5, 2, 1, 9, 9, 6, 8, 9, …
## $ job_offers <dbl> 5, 4, 0, 1, 4, 1, 2, 2, 2, 0, 5, 5, 2, 2, 1, 3, …
## $ starting_salary <dbl> 27200, 25000, 42400, 57400, 47600, 68400, 55500,…
## $ career_satisfaction <dbl> 4, 1, 9, 7, 9, 9, 7, 2, 2, 4, 9, 7, 9, 4, 9, 7, …
## $ years_to_promotion <dbl> 5, 1, 3, 5, 5, 2, 4, 3, 2, 2, 1, 4, 4, 3, 3, 4, …
## $ current_job_level <fct> Entry, Mid, Entry, Mid, Entry, Entry, Mid, Entry…
## $ work_life_balance <dbl> 7, 7, 7, 5, 2, 8, 3, 3, 2, 2, 2, 6, 8, 3, 6, 3, …
## $ entrepreneurship <fct> No, No, No, No, No, Yes, No, No, No, No, No, Yes…
# Pairwise scatterplot matrix over every numeric column.
# NOTE: passing alpha inside aes() maps it as a data aesthetic
# (creating an alpha scale/legend) rather than setting point
# transparency; the GGally-documented way is to set it on the
# point layer via wrap().
num_cols <- names(df)[vapply(df, is.numeric, logical(1))]
GGally::ggpairs(
  df,
  columns = num_cols,
  lower = list(continuous = GGally::wrap("points", alpha = 0.4))
)
# Histogram of starting salaries across all graduates.
ggplot(df) +
  geom_histogram(aes(x = starting_salary),
                 bins = 30, fill = "steelblue", color = "white") +
  scale_x_continuous(labels = scales::label_dollar()) +
  theme_minimal() +
  labs(
    title = "Distribution of Starting Salary",
    x = "Starting Salary",
    y = "Number of Graduates"
  )
# Jittered scatter of internship count against starting salary,
# with an OLS trend line overlaid.
ggplot(df, aes(x = internships_completed, y = starting_salary)) +
  geom_jitter(alpha = 0.4, width = 0.25) +
  geom_smooth(color = "darkred", method = "lm", se = FALSE) +
  scale_y_continuous(labels = scales::label_dollar()) +
  theme_minimal() +
  labs(
    title = "Internships Completed vs. Starting Salary",
    x = "Internships Completed",
    y = "Starting Salary"
  )
# Boxplot of promotion timing for entrepreneurs vs. non-entrepreneurs.
# The fill legend duplicates the x-axis, so it is suppressed —
# consistent with the other grouped boxplots in this report.
ggplot(df, aes(entrepreneurship, years_to_promotion, fill = entrepreneurship)) +
  geom_boxplot(alpha = 0.7) +
  labs(title = "Years to Promotion by Entrepreneurship Status",
       x = "Entrepreneurship", y = "Years to Promotion") +
  theme_minimal() +
  theme(legend.position = "none")
# Starting salary against university ranking, with a LOESS smoother.
# (Lower ranking numbers indicate better-ranked universities.)
ggplot(df, aes(x = university_ranking, y = starting_salary)) +
  geom_point(alpha = 0.4) +
  geom_smooth(se = FALSE, method = "loess", color = "steelblue") +
  scale_y_continuous(labels = scales::label_dollar()) +
  theme_minimal() +
  labs(
    title = "Starting Salary vs. University Ranking",
    x = "University Ranking (lower = better)",
    y = "Starting Salary"
  )
# Starting salary distribution by gender; the fill legend duplicates
# the x-axis labels, so it is dropped.
ggplot(df) +
  geom_boxplot(aes(x = gender, y = starting_salary, fill = gender),
               alpha = 0.7, outlier.alpha = 0.3) +
  scale_y_continuous(labels = scales::label_dollar()) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(
    title = "Starting Salary by Gender",
    x = "Gender",
    y = "Starting Salary"
  )
# Count of graduates by number of job offers received.
ggplot(df, aes(x = job_offers)) +
  geom_bar(alpha = 0.8, fill = "coral") +
  theme_minimal() +
  labs(
    title = "Distribution of Number of Job Offers",
    x = "Job Offers",
    y = "Number of Graduates"
  )
# Career satisfaction (1-10 scale) split by entrepreneurship status;
# fill legend suppressed since it mirrors the x-axis.
ggplot(df) +
  geom_boxplot(aes(x = entrepreneurship, y = career_satisfaction,
                   fill = entrepreneurship),
               alpha = 0.7) +
  theme_minimal() +
  theme(legend.position = "none") +
  labs(
    title = "Career Satisfaction by Entrepreneurship Status",
    x = "Entrepreneurship",
    y = "Career Satisfaction (1–10)"
  )
# 80/20 split stratified on the salary outcome.
# Fix the RNG state first so the partition — and every result
# downstream of it — is reproducible across knits; without a seed
# the reported tables cannot be regenerated.
set.seed(123)
train_idx <- createDataPartition(df$starting_salary, p = 0.8, list = FALSE)
train <- df[train_idx, ]
test  <- df[-train_idx, ]
# Theory-driven baseline: academic performance, experience counts,
# interpersonal scores, and field of study.
lm_base <- lm(starting_salary ~ university_gpa + internships_completed +
certifications + soft_skills_score + networking_score +
field_of_study, data = train)
# Full model: every remaining column except entrepreneurship.
# NOTE(review): the "." formula pulls in job_offers, career_satisfaction,
# years_to_promotion, current_job_level, and work_life_balance — all
# plausibly determined at or after career start, i.e. potential outcome
# leakage for predicting starting salary. Confirm this is intended.
lm_full <- lm(starting_salary ~ . -entrepreneurship, data = train)
lm_step <- MASS::stepAIC(lm_full, trace = 0) # AIC‑optimized model
# Rank the candidates on R2 / RMSE / information-criterion weights.
compare_performance(lm_base, lm_step, rank = TRUE)
## # Comparison of Model Performance Indices
##
## Name | Model | R2 | R2 (adj.) | RMSE | Sigma | AIC weights
## --------------------------------------------------------------------------
## lm_step | lm | 0.003 | 0.002 | 14496.693 | 14503.943 | 1.000
## lm_base | lm | 0.003 | -1.888e-04 | 14500.347 | 14522.135 | 1.22e-04
##
## Name | AICc weights | BIC weights | Performance-Score
## --------------------------------------------------------
## lm_step | 1.000 | 1.000 | 100.00%
## lm_base | 1.18e-04 | 1.42e-15 | 0.00%
model_parameters(lm_step)
## Parameter | Coefficient | SE | 95% CI | t(3998) | p
## --------------------------------------------------------------------------------------
## (Intercept) | 49943.31 | 651.79 | [48665.44, 51221.18] | 76.62 | < .001
## university ranking | 1.56 | 0.79 | [ 0.02, 3.11] | 1.99 | 0.047
## internships completed | 315.66 | 162.83 | [ -3.58, 634.89] | 1.94 | 0.053
## job offers | -303.24 | 134.42 | [ -566.77, -39.70] | -2.26 | 0.024
# Hold-out evaluation of the stepwise model, then the four base-R
# lm diagnostic plots (residuals, QQ, scale-location, leverage)
# arranged in a 2x2 grid.
salary_pred <- predict(lm_step, test)
rmse_val <- RMSE(salary_pred, test$starting_salary)
cat("Test RMSE:", round(rmse_val, 2), "\n")
## Test RMSE: 14414.69
par(mfrow = c(2, 2))
plot(lm_step)
par(mfrow = c(1, 1))
# Logistic regression for entrepreneurship on all predictors except
# starting_salary, followed by AIC-based stepwise selection.
# NOTE(review): the "." formula includes outcome-adjacent fields
# (career_satisfaction, current_job_level, years_to_promotion,
# work_life_balance) measured after career start — confirm these are
# legitimate predictors for this question rather than leakage.
glm_full <- glm(entrepreneurship ~ . -starting_salary, data = train, family = binomial)
glm_step <- MASS::stepAIC(glm_full, trace = 0)
# Report odds ratios (exponentiated coefficients) with 95% CIs.
model_parameters(glm_step, exponentiate = TRUE)
## Parameter | Odds Ratio | SE | 95% CI | z | p
## -------------------------------------------------------------------------
## (Intercept) | 0.14 | 0.05 | [0.07, 0.27] | -5.85 | < .001
## gender [Male] | 0.95 | 0.08 | [0.81, 1.11] | -0.64 | 0.524
## gender [Other] | 0.64 | 0.15 | [0.39, 0.99] | -1.93 | 0.053
## high school gpa | 1.15 | 0.08 | [1.01, 1.32] | 2.09 | 0.037
## sat score | 1.00 | 1.94e-04 | [1.00, 1.00] | 1.46 | 0.145
## soft skills score | 0.98 | 0.01 | [0.95, 1.00] | -1.80 | 0.072
# Predicted probabilities on the hold-out set, then accuracy,
# sensitivity, and specificity evaluated across a grid of
# classification cutoffs from 0.1 to 0.9.
probs <- predict(glm_step, test, type = "response")
thresholds <- seq(0.1, 0.9, by = 0.1)
metrics <- map_dfr(thresholds, function(thr) {
  # `thr` rather than `t`, which shadows base::t().
  pred <- factor(ifelse(probs >= thr, "Yes", "No"), levels = c("No", "Yes"))
  cm <- confusionMatrix(pred, test$entrepreneurship, positive = "Yes")
  tibble(threshold = thr,
         Accuracy = cm$overall["Accuracy"],
         Sensitivity = cm$byClass["Sensitivity"],
         Specificity = cm$byClass["Specificity"])
})
print(metrics)
## # A tibble: 9 × 4
## threshold Accuracy Sensitivity Specificity
## <dbl> <dbl> <dbl> <dbl>
## 1 0.1 0.197 1 0
## 2 0.2 0.461 0.579 0.432
## 3 0.3 0.803 0 1
## 4 0.4 0.803 0 1
## 5 0.5 0.803 0 1
## 6 0.6 0.803 0 1
## 7 0.7 0.803 0 1
## 8 0.8 0.803 0 1
## 9 0.9 0.803 0 1
# Visualize how each metric trades off as the cutoff moves.
metrics %>%
  pivot_longer(-threshold) %>%
  ggplot(aes(x = threshold, y = value, color = name)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  labs(
    title = "Threshold Tuning Metrics",
    x = "Probability Threshold",
    y = "Metric Value"
  )
# Choose the cutoff maximizing Youden's J (sensitivity + specificity),
# then report the final hold-out confusion matrix at that cutoff.
youden <- metrics$Sensitivity + metrics$Specificity
best_t <- metrics$threshold[which.max(youden)]
cat("Optimal threshold:", best_t, "\n")
## Optimal threshold: 0.2
pred_final <- factor(ifelse(probs >= best_t, "Yes", "No"),
                     levels = c("No", "Yes"))
conf_final <- confusionMatrix(pred_final, test$entrepreneurship,
                              positive = "Yes")
print(conf_final)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 346 83
## Yes 455 114
##
## Accuracy : 0.4609
## 95% CI : (0.4296, 0.4924)
## No Information Rate : 0.8026
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0062
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5787
## Specificity : 0.4320
## Pos Pred Value : 0.2004
## Neg Pred Value : 0.8065
## Prevalence : 0.1974
## Detection Rate : 0.1142
## Detection Prevalence : 0.5701
## Balanced Accuracy : 0.5053
##
## 'Positive' Class : Yes
##
Salary: I compared a theory‑driven baseline model to a full model, then used stepwise AIC to identify a more parsimonious linear model. Test RMSE was approximately $14,415; however, with an R² of roughly 0.003, the selected model explains almost none of the variance in starting salary, so its predictive value is very limited.
Entrepreneurship: I fit a full logistic regression model, applied stepwise AIC for model selection, and tuned the classification threshold (optimal ≈ 0.2) to balance sensitivity and specificity. The resulting test accuracy of about 0.461 falls well below the no‑information rate of 0.803, indicating the classifier performs little better than chance.
Taken at face value, the coefficient signs suggest that practical experience and interpersonal development may relate to early‑career salary, and that stronger academics could be associated with entrepreneurial paths — but given the models' near‑null explanatory and discriminative performance, these patterns should be treated as tentative rather than established effects.
Building on these insights:
Students:
Prioritize internships and soft‑skill development to boost early‑career
earnings; nurture satisfaction and academic excellence if
entrepreneurship is a goal.
Advisors and Universities:
Facilitate practical experiences and communication training, especially
for non‑STEM majors, to narrow salary gaps and support entrepreneurial
pathways.